import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import yaml
import os
import warnings
import matplotlib as plt
warnings.filterwarnings('ignore')
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
with open("../data/resources/rcParams.yaml") as f:
rcParamsDict = yaml.full_load(f)
for k in rcParamsDict["rcParams"]:
print("{} {}".format(k,rcParamsDict["rcParams"][k]))
plt.rcParams[k] = rcParamsDict["rcParams"][k]
for k1 in set(list(rcParamsDict)).difference(set(["rcParams"])):
print("{} {}".format(k1,rcParamsDict[k1]))
scanpy==1.8.0 anndata==0.8.0 umap==0.4.6 numpy==1.22.2 scipy==1.6.2 pandas==1.2.3 scikit-learn==0.24.1 statsmodels==0.13.5 python-igraph==0.9.1 louvain==0.7.0 leidenalg==0.8.3 figure.dpi 50 savefig.dpi 500 figure.figsize [10, 10] axes.facecolor white dotSize 20
DSname="UpD50"
DSDirName="Sample_S20272_157"
outdir = "../data/output"
if not os.path.exists(outdir):
# Create a new directory because it does not exist
os.makedirs(outdir)
if not os.path.exists("../data/output/adatas"):
# Create a new directory because it does not exist
os.makedirs("../data/output/adatas")
with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]
results_file = outdir+'/'+DSname+'.h5ad'
scanpyObj = sc.read_10x_mtx('../data/'+DSDirName+'/filtered_feature_bc_matrix/', # the directory with the `.mtx` file
var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)
cache=True) # write a cache file for faster subsequent reading
scanpyObj.obs['dataset'] = DSname
scanpyObj.obs_names = [i + "_" + j for i, j in zip(scanpyObj.obs_names.tolist(), scanpyObj.obs["dataset"].tolist())]
... reading from cache file cache/..-data-Sample_S20272_157-filtered_feature_bc_matrix-matrix.h5ad
scanpyObj.var_names_make_unique()
scanpyObj.obs_names_make_unique()
cellID = pd.read_csv('../data/'+DSDirName+'/aggregatedCall/aggregatedCall.tsv', sep ="\t",index_col = 0)
cellID
cellID.index = [i + "_" + DSname for i in cellID.index.tolist()]
cellID.Consensus.unique()
array(['809', '3391B', 'KOLF', 'MIFF1', 'doublet', 'LowQuality'],
dtype=object)
scanpyObj.obs['cellID'] = cellID.loc[scanpyObj.obs_names, 'Consensus']
scanpyObj.obs
| dataset | cellID | |
|---|---|---|
| AAACCTGAGATGTGGC-1_UpD50 | UpD50 | 809 |
| AAACCTGAGCAACGGT-1_UpD50 | UpD50 | doublet |
| AAACCTGAGCGGATCA-1_UpD50 | UpD50 | 3391B |
| AAACCTGAGCGTAATA-1_UpD50 | UpD50 | 3391B |
| AAACCTGAGTCGCCGT-1_UpD50 | UpD50 | doublet |
| ... | ... | ... |
| TTTGTCATCGCCTGTT-1_UpD50 | UpD50 | 3391B |
| TTTGTCATCGCGTAGC-1_UpD50 | UpD50 | 3391B |
| TTTGTCATCGGTGTTA-1_UpD50 | UpD50 | 809 |
| TTTGTCATCGTTTATC-1_UpD50 | UpD50 | 809 |
| TTTGTCATCTCTTGAT-1_UpD50 | UpD50 | 809 |
17100 rows × 2 columns
cellID_colors = {}
cellID_newName_colors = {}
cellID_newNames = {}
for line in iPSC_lines_map.keys():
cellID_colors[iPSC_lines_map[line]["oldName"]] = iPSC_lines_map[line]["color"]
cellID_newName_colors[iPSC_lines_map[line]["newName"]] = iPSC_lines_map[line]["color"]
cellID_newNames[iPSC_lines_map[line]["oldName"]] = iPSC_lines_map[line]["newName"]
scanpyObj.obs["cellID"] = scanpyObj.obs["cellID"].astype("category")
scanpyObj.obs["cellID_newName"] = scanpyObj.obs["cellID"].replace(cellID_newNames, inplace=False).astype("category")
scanpyObj.uns["cellID_colors"] = [cellID_colors[line] for line in scanpyObj.obs["cellID"].cat.categories]
scanpyObj.uns["cellID_newName_colors"] = [cellID_newName_colors[line] for line in scanpyObj.obs["cellID_newName"].cat.categories]
[cellID_colors[line] for line in scanpyObj.obs["cellID"].cat.categories]
['#DBB807', '#FF0054', '#0FB248', '#a8a5a5', '#7B00FF', '#403b3b']
Show those genes that yield the highest fraction of counts in each single cells, across all cells.
sc.pl.highest_expr_genes(scanpyObj, n_top=20 )
normalizing counts per cell
finished (0:00:00)
goodBarcodes=pd.read_csv(outdir+'/'+DSname+'_filteredCells.tsv', sep="\t")["BCs"]+"_"+DSname
scanpyObj = scanpyObj[goodBarcodes,:]
scanpyObj.var['mt'] = scanpyObj.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(scanpyObj, qc_vars=['mt'], percent_top=None, log1p=True, inplace=True)
scanpyObj.var['ribo'] = scanpyObj.var_names.str.startswith('RP') # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(scanpyObj, qc_vars=['ribo'], percent_top=None, log1p=True, inplace=True)
scanpyObj
AnnData object with n_obs × n_vars = 10586 × 33538
obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo'
uns: 'cellID_colors', 'cellID_newName_colors'
scanpyObj.var
| gene_ids | feature_types | mt | n_cells_by_counts | mean_counts | log1p_mean_counts | pct_dropout_by_counts | total_counts | log1p_total_counts | ribo | |
|---|---|---|---|---|---|---|---|---|---|---|
| MIR1302-2HG | ENSG00000243485 | Gene Expression | False | 2 | 0.000189 | 0.000189 | 99.981107 | 2.0 | 1.098612 | False |
| FAM138A | ENSG00000237613 | Gene Expression | False | 0 | 0.000000 | 0.000000 | 100.000000 | 0.0 | 0.000000 | False |
| OR4F5 | ENSG00000186092 | Gene Expression | False | 1 | 0.000094 | 0.000094 | 99.990554 | 1.0 | 0.693147 | False |
| AL627309.1 | ENSG00000238009 | Gene Expression | False | 14 | 0.001323 | 0.001322 | 99.867750 | 14.0 | 2.708050 | False |
| AL627309.3 | ENSG00000239945 | Gene Expression | False | 3 | 0.000283 | 0.000283 | 99.971661 | 3.0 | 1.386294 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| AC233755.2 | ENSG00000277856 | Gene Expression | False | 0 | 0.000000 | 0.000000 | 100.000000 | 0.0 | 0.000000 | False |
| AC233755.1 | ENSG00000275063 | Gene Expression | False | 0 | 0.000000 | 0.000000 | 100.000000 | 0.0 | 0.000000 | False |
| AC240274.1 | ENSG00000271254 | Gene Expression | False | 419 | 0.041375 | 0.040542 | 96.041942 | 438.0 | 6.084499 | False |
| AC213203.1 | ENSG00000277475 | Gene Expression | False | 0 | 0.000000 | 0.000000 | 100.000000 | 0.0 | 0.000000 | False |
| FAM231C | ENSG00000268674 | Gene Expression | False | 1 | 0.000094 | 0.000094 | 99.990554 | 1.0 | 0.693147 | False |
33538 rows × 10 columns
scanpyObj.obs
| dataset | cellID | cellID_newName | n_genes_by_counts | log1p_n_genes_by_counts | total_counts | log1p_total_counts | total_counts_mt | log1p_total_counts_mt | pct_counts_mt | total_counts_ribo | log1p_total_counts_ribo | pct_counts_ribo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAACCTGAGATGTGGC-1_UpD50 | UpD50 | 809 | CTL04E | 1802 | 7.497207 | 4326.0 | 8.372630 | 122.0 | 4.812184 | 2.820157 | 1123.0 | 7.024649 | 25.959316 |
| AAACCTGAGCGGATCA-1_UpD50 | UpD50 | 3391B | CTL01 | 2139 | 7.668561 | 7216.0 | 8.884194 | 68.0 | 4.234107 | 0.942350 | 2753.0 | 7.920810 | 38.151329 |
| AAACCTGAGTGCCAGA-1_UpD50 | UpD50 | 809 | CTL04E | 1222 | 7.109062 | 2554.0 | 7.845808 | 17.0 | 2.890372 | 0.665623 | 566.0 | 6.340359 | 22.161316 |
| AAACCTGCAAGCGCTC-1_UpD50 | UpD50 | 809 | CTL04E | 1568 | 7.358194 | 4419.0 | 8.393895 | 45.0 | 3.828641 | 1.018330 | 997.0 | 6.905753 | 22.561665 |
| AAACCTGCACACATGT-1_UpD50 | UpD50 | 3391B | CTL01 | 3299 | 8.101678 | 8716.0 | 9.073030 | 35.0 | 3.583519 | 0.401560 | 1379.0 | 7.229839 | 15.821478 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TTTGTCATCGCCTGTT-1_UpD50 | UpD50 | 3391B | CTL01 | 1361 | 7.216709 | 2898.0 | 7.972121 | 20.0 | 3.044523 | 0.690131 | 628.0 | 6.444131 | 21.670118 |
| TTTGTCATCGCGTAGC-1_UpD50 | UpD50 | 3391B | CTL01 | 1203 | 7.093405 | 2296.0 | 7.739359 | 11.0 | 2.484907 | 0.479094 | 547.0 | 6.306275 | 23.824041 |
| TTTGTCATCGGTGTTA-1_UpD50 | UpD50 | 809 | CTL04E | 849 | 6.745236 | 1451.0 | 7.280697 | 17.0 | 2.890372 | 1.171606 | 179.0 | 5.192957 | 12.336320 |
| TTTGTCATCGTTTATC-1_UpD50 | UpD50 | 809 | CTL04E | 2815 | 7.943073 | 8304.0 | 9.024613 | 171.0 | 5.147494 | 2.059248 | 2092.0 | 7.646354 | 25.192678 |
| TTTGTCATCTCTTGAT-1_UpD50 | UpD50 | 809 | CTL04E | 1829 | 7.512071 | 3861.0 | 8.258941 | 184.0 | 5.220356 | 4.765605 | 473.0 | 6.161207 | 12.250712 |
10586 rows × 13 columns
sc.pl.violin(scanpyObj, ['n_genes_by_counts', 'total_counts'],
groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
sc.pl.violin(scanpyObj, ['total_counts_mt','total_counts_ribo'],
groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_total_counts_ribo')
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_ribo')
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
groupby= "cellID", jitter=0.4, multi_panel=True)
sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
groupby= "cellID", jitter=0.4, multi_panel=True)
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_ribo')
scanpyObj.write_h5ad(outdir+'/adatas/'+DSname+'_raw.h5ad')
sc.pp.normalize_total(scanpyObj, exclude_highly_expressed=True, max_fraction=.1)
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['MALAT1']
finished (0:00:00)
sc.pp.log1p(scanpyObj)
scanpyObj.raw = scanpyObj
HVGs=pd.read_csv(outdir+"/HVG_list_intersection_Curated.txt", sep = "\t")["HVG"]
scanpyObj = scanpyObj[:,HVGs]
scanpyObj.var["highly_variable"] = True
#sc.pp.highly_variable_genes(scanpyObj, min_mean=0.0125, max_mean=5, min_disp=0.5)
#scanpyObj = scanpyObj[:, HVG.tolist()]
#Multiplexing = Multiplexing[:, Multiplexing.var.highly_variable]
#sc.pl.highly_variable_genes(scanpyObj)
Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed. Scale the data to unit variance.
sc.pp.regress_out(scanpyObj, ['total_counts',"pct_counts_mt"])
regressing out ['total_counts', 'pct_counts_mt']
sparse input is densified and may lead to high memory use
finished (0:00:27)
Scale each gene to unit variance. Clip values exceeding standard deviation 10.
sc.pp.scale(scanpyObj, max_value=20)
scanpyObj.obs
| dataset | cellID | cellID_newName | n_genes_by_counts | log1p_n_genes_by_counts | total_counts | log1p_total_counts | total_counts_mt | log1p_total_counts_mt | pct_counts_mt | total_counts_ribo | log1p_total_counts_ribo | pct_counts_ribo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAACCTGAGATGTGGC-1_UpD50 | UpD50 | 809 | CTL04E | 1802 | 7.497207 | 4326.0 | 8.372630 | 122.0 | 4.812184 | 2.820157 | 1123.0 | 7.024649 | 25.959316 |
| AAACCTGAGCGGATCA-1_UpD50 | UpD50 | 3391B | CTL01 | 2139 | 7.668561 | 7216.0 | 8.884194 | 68.0 | 4.234107 | 0.942350 | 2753.0 | 7.920810 | 38.151329 |
| AAACCTGAGTGCCAGA-1_UpD50 | UpD50 | 809 | CTL04E | 1222 | 7.109062 | 2554.0 | 7.845808 | 17.0 | 2.890372 | 0.665623 | 566.0 | 6.340359 | 22.161316 |
| AAACCTGCAAGCGCTC-1_UpD50 | UpD50 | 809 | CTL04E | 1568 | 7.358194 | 4419.0 | 8.393895 | 45.0 | 3.828641 | 1.018330 | 997.0 | 6.905753 | 22.561665 |
| AAACCTGCACACATGT-1_UpD50 | UpD50 | 3391B | CTL01 | 3299 | 8.101678 | 8716.0 | 9.073030 | 35.0 | 3.583519 | 0.401560 | 1379.0 | 7.229839 | 15.821478 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TTTGTCATCGCCTGTT-1_UpD50 | UpD50 | 3391B | CTL01 | 1361 | 7.216709 | 2898.0 | 7.972121 | 20.0 | 3.044523 | 0.690131 | 628.0 | 6.444131 | 21.670118 |
| TTTGTCATCGCGTAGC-1_UpD50 | UpD50 | 3391B | CTL01 | 1203 | 7.093405 | 2296.0 | 7.739359 | 11.0 | 2.484907 | 0.479094 | 547.0 | 6.306275 | 23.824041 |
| TTTGTCATCGGTGTTA-1_UpD50 | UpD50 | 809 | CTL04E | 849 | 6.745236 | 1451.0 | 7.280697 | 17.0 | 2.890372 | 1.171606 | 179.0 | 5.192957 | 12.336320 |
| TTTGTCATCGTTTATC-1_UpD50 | UpD50 | 809 | CTL04E | 2815 | 7.943073 | 8304.0 | 9.024613 | 171.0 | 5.147494 | 2.059248 | 2092.0 | 7.646354 | 25.192678 |
| TTTGTCATCTCTTGAT-1_UpD50 | UpD50 | 809 | CTL04E | 1829 | 7.512071 | 3861.0 | 8.258941 | 184.0 | 5.220356 | 4.765605 | 473.0 | 6.161207 | 12.250712 |
10586 rows × 13 columns
sc.tl.pca(scanpyObj, svd_solver='arpack')
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:04)
sc.pl.pca(scanpyObj, color='MKI67')
sc.pl.pca_variance_ratio(scanpyObj, log=True)
scanpyObj
AnnData object with n_obs × n_vars = 10586 × 3499
obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo', 'highly_variable', 'mean', 'std'
uns: 'cellID_colors', 'cellID_newName_colors', 'log1p', 'pca'
obsm: 'X_pca'
varm: 'PCs'
sc.pp.neighbors(scanpyObj, n_neighbors=10, n_pcs=9)
computing neighbors
using 'X_pca' with n_pcs = 9
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:07)
sc.tl.umap(scanpyObj)
scanpyObj
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:04)
AnnData object with n_obs × n_vars = 10586 × 3499
obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo', 'highly_variable', 'mean', 'std'
uns: 'cellID_colors', 'cellID_newName_colors', 'log1p', 'pca', 'neighbors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'distances', 'connectivities'
sc.pl.umap(scanpyObj, color=['MKI67', 'S100B',"NES","PAX6","DCX",'STMN2',"DCN", "BGN"])
sc.tl.diffmap(scanpyObj)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
finished (0:00:00)
eigenvalues of transition matrix
[1. 0.996754 0.99504185 0.9920966 0.9898147 0.9879345
0.98735774 0.98407966 0.9831898 0.97786796 0.97119904 0.96953475
0.9642359 0.9634919 0.9609128 ]
finished: added
'X_diffmap', diffmap coordinates (adata.obsm)
'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)
sc.tl.dpt(scanpyObj)
WARNING: No root cell found. To compute pseudotime, pass the index or expression vector of a root cell, one of:
adata.uns['iroot'] = root_cell_index
adata.var['xroot'] = adata[root_cell_name, :].X
computing Diffusion Pseudotime using n_dcs=10
finished: added
(0:00:00)
sc.pl.diffmap(scanpyObj, color=[ 'MKI67', 'S100B',"NES","PAX6","DCX",'STMN2',"DCN", "BGN"])
scanpyObj.X.max()
20.0